import org.apache.spark.{SparkConf, SparkContext}

object WordCount {

  // Spark recommends defining a main() method rather than extending scala.App,
  // which may not work correctly in Spark applications.
  def main(args: Array[String]): Unit = {

    // Create the SparkContext, reusing an existing one if the driver already has one
    val sparkConf = new SparkConf()
    sparkConf.setAppName("spark-word-count-on-yarn")
    val sc = SparkContext.getOrCreate(sparkConf)
    // Define an RDD that reads the data from HDFS
    val hdfsRdd = sc.textFile("hdfs://hadoop-master:9000/user/root/testdata/customers.csv")
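
    // Assumed (not stated in the source): the CSV columns are laid out as
    // id,first_name,last_name,... with no header row. If the file did have a
    // header, it would need to be dropped first, e.g. (hypothetical header):
    //   val dataRdd = hdfsRdd.filter(!_.startsWith("id,"))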

    // Extract the first_name and last_name fields
    val firstNameAndLastNameRdd = hdfsRdd.map { line =>
      val Array(_, first_name, last_name, _*) = line.split(",")
      (first_name, last_name)
    }
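
    // Note: the pattern match above throws a MatchError for any line with fewer
    // than three comma-separated fields. A more defensive sketch, assuming the
    // same column layout, keeps only well-formed lines:
    //   val safeRdd = hdfsRdd.map(_.split(",")).collect {
    //     case fields if fields.length >= 3 => (fields(1), fields(2))
    //   }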

    // Top 5 largest families (most common last names) among the customers
    firstNameAndLastNameRdd
      .map(x => (x._2, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
      .take(5)
      .foreach(println)

    // Top 10 most popular first names among the customers
    firstNameAndLastNameRdd
      .map(x => (x._1, 1))
      .reduceByKey(_ + _)
      .sortBy(_._2, ascending = false)
      .take(10)
      .foreach(println)

    // Release cluster resources when the job is done
    sc.stop()
  }

}
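
// A typical way to submit this job to YARN (the jar name is an assumption,
// adjust to your build and cluster):
//   spark-submit --master yarn --deploy-mode cluster \
//     --class WordCount word-count.jar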
